transformers の文書分類スクリプト run_glue.py に渡す、livedoorニュースコーパスの前処理
--train_file, --validation_file, --test_file を指定できるように JSON(1行1レコードの JSON Lines 形式)を作る
以下のスクリプトで3つのJSONファイルを作る
code:preprocess.py
import argparse
import glob
from pathlib import Path
import jsonlines
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Category names of the livedoor news corpus. Each name is used both as the
# sub-directory / file-name prefix to read from and as the label stored in
# the output records.
categories = [
    "dokujo-tsushin",
    "it-life-hack",
    "kaden-channel",
    "livedoor-homme",
    "movie-enter",
    "peachy",
    "smax",
    "sports-watch",
    "topic-news",
]
def save_datasets(datasets, path):
    """Write the records in ``datasets`` to ``path``, one JSON object per line.

    Args:
        datasets: Iterable of JSON-serializable records.
        path: Destination file. It is opened in write mode, so any existing
            content is replaced.
    """
    with jsonlines.open(path, mode="w") as out:
        for record in datasets:
            out.write(record)
if __name__ == "__main__":
    # CLI: the directory holding the per-category article folders, and the
    # directory that receives train.json / val.json / test.json.
    parser = argparse.ArgumentParser()
    parser.add_argument("text_dir_path")
    parser.add_argument("output_root_path", type=Path)
    args = parser.parse_args()

    datasets = []
    for category in tqdm(categories):
        # Only files named "<category>*" are picked up, so other files in the
        # folder are ignored. The paths are sorted because glob order depends
        # on the filesystem; without sorting, the seeded split below would
        # differ from machine to machine.
        pattern = f"{args.text_dir_path}/{category}/{category}*"
        for path in sorted(glob.glob(pattern)):
            # Decode explicitly instead of relying on the platform default.
            # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
            with open(path, encoding="utf-8") as f:
                lines = f.read().splitlines()
            # Extract the article body by dropping the first three lines
            # (presumably URL, timestamp and title -- verify against the
            # corpus README).
            text = "\n".join(lines[3:])
            datasets.append({"text": text, "label": category})

    if not datasets:
        # Fail with a clear message instead of an obscure error from
        # train_test_split on an empty list.
        parser.error(f"no article files found under {args.text_dir_path!r}")

    args.output_root_path.mkdir(parents=True, exist_ok=True)

    # 60% train; the remaining 40% is halved into validation and test
    # (20% each). Fixed seeds keep the split reproducible.
    train_datasets, other_datasets = train_test_split(
        datasets, train_size=0.6, random_state=42
    )
    val_datasets, test_datasets = train_test_split(
        other_datasets, test_size=0.5, random_state=42
    )

    save_datasets(train_datasets, args.output_root_path / "train.json")
    save_datasets(val_datasets, args.output_root_path / "val.json")
    save_datasets(test_datasets, args.output_root_path / "test.json")